{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "pd.set_option(\"display.max_columns\", None)\n",
    "dir = r'/Volumes/Zihao_SSD2/PatentsView/'\n",
    "fig_dir = r'/Volumes/Zihao_SSD2/PatentsView/figures/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(dir + 'regdata/reg_panel.csv', encoding = 'utf-8', low_memory=False)\n",
    "df = df.dropna(subset=['main_cpc_section_i', 'main_cpc_section_j'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure A1\n",
    "## Distribution of Cosine Similairity\n",
    "all_female_data = df[df['allfemale_09_100_j'] == 1]\n",
    "not_all_female_data = df[df['allfemale_09_100_j'] == 0]\n",
    "average_sim_scores_all_female = all_female_data.groupby('patent_id_i')['sim_score'].mean()\n",
    "average_sim_scores_not_all_female = not_all_female_data.groupby('patent_id_i')['sim_score'].mean()\n",
    "\n",
    "fig, axs = plt.subplots(1, 2, figsize=(12, 4))\n",
    "for ax in axs:\n",
    "    ax.spines['right'].set_visible(False)\n",
    "    ax.spines['top'].set_visible(False)\n",
    "    \n",
    "axs[0].hist(not_all_female_data['sim_score'], bins=50, alpha=0.5, label='Not All Female', color='blue', density=True)\n",
    "axs[0].hist(all_female_data['sim_score'], bins=50, alpha=0.5, label='All Female', color='red', density=True)\n",
    "axs[0].yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: \"{:,}\".format(int(x/1000))))\n",
    "axs[0].set_xlabel('Cosine Similarity')\n",
    "axs[0].set_ylabel('Count (in thousands)')\n",
    "axs[0].legend()\n",
    "\n",
    "axs[1].hist(average_sim_scores_not_all_female, bins=50, alpha=0.5, label='Not All Female', color='blue', density=True)\n",
    "axs[1].hist(average_sim_scores_all_female, bins=50, alpha=0.5, label='All Female', color='red', density=True)\n",
    "axs[1].yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: \"{:,}\".format(int(x/1000))))\n",
    "axs[1].set_xlabel('Mean Cosine Similarity')\n",
    "axs[1].set_ylabel('Count (in thousands)')\n",
    "axs[1].legend()\n",
    "\n",
    "plt.subplots_adjust(wspace=0.3)\n",
    "plt.savefig(fig_dir + 'figurea1.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure A2\n",
    "## Patent quality distribution of male vs. female\n",
    "male_data = df[df['allmale_09_100_j'] == 1]['dollar_real_log_j']\n",
    "female_data = df[df['allfemale_09_100_j'] == 1]['dollar_real_log_j']\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.hist(male_data, bins=60, alpha=0.5, label='Not All Female', color='blue', density=True)\n",
    "plt.hist(female_data, bins=60, alpha=0.5, label='All Female', color='red', density=True)\n",
    "plt.xlabel('log(KPSS Index)')\n",
    "plt.ylabel('Frequency')\n",
    "ax = plt.gca()\n",
    "ax.spines['right'].set_visible(False)\n",
    "ax.spines['top'].set_visible(False)\n",
    "plt.legend(loc='upper right')\n",
    "plt.savefig(fig_dir + 'figurea2.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure A3\n",
    "## Pie/Bar chart of the omission index\n",
    "def plot_omit(df, k):\n",
    "    ## Input: omission_panel.csv\n",
    "    ## Output: pie chart and histogram\n",
    "    # Drop sim_score == 1\n",
    "    df = df[df['sim_score'] != 1]\n",
    "\n",
    "    # Group by patent_id\n",
    "    df = df.groupby('patent_id').agg(sum_omission=('omission', 'sum')).reset_index()\n",
    "    df['sum_omission'] = df['sum_omission'].apply(lambda x: k-x)\n",
    "\n",
    "    fig, axs = plt.subplots(1, 2, figsize=(10, 5), constrained_layout=True)\n",
    "\n",
    "    # Histogram\n",
    "    df_filtered = df[df['sum_omission'] != 0]\n",
    "    weights = np.ones_like(df_filtered['sum_omission']) / len(df_filtered)\n",
    "    bins = range(1, k + 2)\n",
    "    axs[0].hist(df_filtered['sum_omission'], weights=weights, bins=bins, rwidth=0.7, color='blue', alpha=0.5)\n",
    "    axs[0].set_xticks(range(1, k + 1))\n",
    "    axs[0].set_xlabel('Number of patents actually cited among the relevant list', fontsize=12)\n",
    "    axs[0].set_ylabel('Percentage', fontsize=12)\n",
    "\n",
    "    # Remove top and right spines from histogram\n",
    "    axs[0].spines['right'].set_visible(False)\n",
    "    axs[0].spines['top'].set_visible(False)\n",
    "\n",
    "    # Pie chart\n",
    "    df['agg_omission'] = df['sum_omission'].apply(lambda x: 1 if x > 0 else 0)\n",
    "    frequencies = df['agg_omission'].value_counts()\n",
    "    pie_colors = [(1, 0, 0, 0.5), (0, 0, 1, 0.5)]\n",
    "    text_properties = {'fontsize': 12}\n",
    "    axs[1].pie(frequencies, labels=['Not cited', 'Cited'], autopct='%.1f%%', colors=pie_colors, textprops=text_properties)\n",
    "\n",
    "    # Add the desired line below the pie chart\n",
    "    axs[1].set_xlabel(f'Aggregate citation of relevant prior art (k={k})', fontsize=12)\n",
    "    fig.suptitle(f'Length of Relevant Citation List k={k}', fontsize=16, y=1.1)\n",
    "\n",
    "    fig.savefig(fig_dir + f'agg_omit{k}_combined.png', bbox_inches='tight')\n",
    "    plt.close(fig)\n",
    "\n",
    "omit_panel5 = pd.read_csv(dir + 'temp/omission_panel5_robust.csv', encoding='utf-8')\n",
    "plot_omit(df=omit_panel5, k=5)\n",
    "omit_panel10 = pd.read_csv(dir + 'temp/omission_panel10_robust.csv', encoding='utf-8')\n",
    "plot_omit(df=omit_panel10, k=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure A4\n",
    "## Heat Map of female-inventor sorting (All Female Patents)\n",
    "df['first_letter'] = df['main_cpc_subclass_j'].str[0]\n",
    "df['two_digit'] = df['main_cpc_subclass_j'].str[1:3]\n",
    "df['third_letter'] = df['main_cpc_subclass_j'].str[3]\n",
    "df['two_digit'] = pd.to_numeric(df['two_digit'], errors='coerce').fillna(0).astype(int)\n",
    "\n",
    "# Filter out subclasses with fewer than 50 observations\n",
    "subclass_counts = df['main_cpc_subclass_j'].value_counts()\n",
    "df_filtered = df[df['main_cpc_subclass_j'].isin(subclass_counts[subclass_counts >= 50].index)]\n",
    "proportion_df = df_filtered.groupby('main_cpc_subclass_j')['allfemale_09_100_j'].mean().reset_index()\n",
    "\n",
    "sorted_df = (\n",
    "    df_filtered[['main_cpc_subclass_j', 'first_letter', 'two_digit', 'third_letter']]\n",
    "    .drop_duplicates()\n",
    "    .sort_values(by=['first_letter', 'two_digit', 'third_letter'])\n",
    "    .merge(proportion_df, on='main_cpc_subclass_j', how='left')\n",
    ")\n",
    "\n",
    "def find_factors_close_to_square(n):\n",
    "    for i in range(int(np.sqrt(n)), 0, -1):\n",
    "        if n % i == 0:\n",
    "            return i, n // i\n",
    "    return 1, n\n",
    "\n",
    "num_subclasses = len(sorted_df)\n",
    "grid_rows, grid_cols = find_factors_close_to_square(num_subclasses)\n",
    "grid_matrix = np.full((grid_rows, grid_cols), np.nan)\n",
    "\n",
    "for (idx, row) in enumerate(sorted_df.itertuples(index=False)):\n",
    "    row_idx = idx // grid_cols\n",
    "    col_idx = idx % grid_cols\n",
    "    grid_matrix[row_idx, col_idx] = row.allfemale_09_100_j\n",
    "\n",
    "plt.figure(figsize=(grid_cols / 5, grid_rows / 5))\n",
    "ax = sns.heatmap(grid_matrix, cmap=\"YlGnBu\", annot=False, cbar=True, square=True, vmax=0.3, \n",
    "                 cbar_kws={'shrink': 0.5, 'ticks': [0.00, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]})\n",
    "ax.set(xlabel='', ylabel='', xticklabels=[], yticklabels=[])\n",
    "ax.tick_params(axis='both', which='both', length=0)\n",
    "sns.despine(left=True, bottom=True, ax=ax)\n",
    "cbar = ax.collections[0].colorbar\n",
    "cbar.ax.tick_params(labelsize=5)\n",
    "cbar.set_ticks([0.00, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3])\n",
    "cbar.set_label('Proportion of All-Female Patents', rotation=270, labelpad=15, fontsize=7)\n",
    "# plt.savefig(fig_dir + 'figurea4.png', dpi=300, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Rank CPC subclasses by their female inventor representaion\n",
    "filtered_df = df.groupby('main_cpc_subclass_j').filter(lambda x: len(x) >= 50)\n",
    "proportion_all_female = filtered_df.groupby('main_cpc_subclass_j')['allfemale_09_100_j'].mean().sort_values(ascending=False)\n",
    "top_subclasses = proportion_all_female.head(15)\n",
    "print(top_subclasses)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A1\n",
    "## Gender composition of cited patent\n",
    "def gender_stats(df):\n",
    "    counts = df['genderdiscrete_09_100_j'].value_counts()\n",
    "    percentages = df['genderdiscrete_09_100_j'].value_counts(normalize=True) * 100\n",
    "    stats_df = pd.concat([counts, percentages], axis=1, keys=['Count', 'Percentage'])\n",
    "    stats_df['Percentage'] = stats_df['Percentage'].round(2)\n",
    "    stats_df.loc['Total'] = stats_df.sum()\n",
    "    stats_df.loc['Total', 'Percentage'] = 100 if stats_df.loc['Total', 'Percentage'] > 100 else stats_df.loc['Total', 'Percentage']\n",
    "    return stats_df\n",
    "\n",
    "stats_df = gender_stats(df)\n",
    "print(stats_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A1\n",
    "## Gender composition of the cited patent by firm tier\n",
    "def gender_stats_by_firmtier(df):\n",
    "    gender_discrete_counts = df.groupby(['tier_j', 'genderdiscrete_09_100_j']).size().reset_index(name='Count')\n",
    "    total_counts = df['tier_j'].value_counts().reset_index().rename(columns={'index': 'tier_j', 'tier_j': 'Total'})\n",
    "    gender_discrete_counts = pd.merge(gender_discrete_counts, total_counts, on='tier_j')\n",
    "    gender_discrete_counts['Percentage'] = (gender_discrete_counts['Count'] / gender_discrete_counts['Total']) * 100\n",
    "    return gender_discrete_counts\n",
    "\n",
    "df['tier_j'] = pd.cut(df['assignee_rank_j'], \n",
    "                      bins=[0, 20, 300, float('inf')], \n",
    "                      labels=[1, 2, 3], \n",
    "                      right=False)\n",
    "\n",
    "stats_df = gender_stats_by_firmtier(df)\n",
    "print(stats_df)\n",
    "print(df['tier_j'].value_counts(normalize=True) * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A1\n",
    "## Gender composition of cited patent by Main CPC Section\n",
    "def gender_stats_by_cpc(df):\n",
    "    df1 = df.dropna(subset=['main_cpc_section_j']) # drop rows where 'main_cpc_section_i' is NaN\n",
    "    gender_discrete_counts = df1.groupby(['main_cpc_section_j', 'genderdiscrete_09_100_j']).size().reset_index(name='Count')\n",
    "    total_counts = df1['main_cpc_section_j'].value_counts().reset_index().rename(columns={'index': 'main_cpc_section_j', 'main_cpc_section_j': 'Total'})\n",
    "    gender_discrete_counts = pd.merge(gender_discrete_counts, total_counts, on='main_cpc_section_j')\n",
    "    gender_discrete_counts['Percentage'] = (gender_discrete_counts['Count'] / gender_discrete_counts['Total']) * 100\n",
    "    return gender_discrete_counts\n",
    "\n",
    "stats_df = gender_stats_by_cpc(df)\n",
    "print(stats_df)\n",
    "print(df['main_cpc_section_j'].value_counts(normalize=True) * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A2\n",
    "## Racial composition of cited patent j\n",
    "def racial_stats(df):\n",
    "    df_filtered = df[df['assignee_country_j'] == \"US\"]\n",
    "    counts = df_filtered['racediscrete_80_j'].value_counts()\n",
    "    percentages = df_filtered['racediscrete_80_j'].value_counts(normalize=True) * 100\n",
    "    stats_df = pd.concat([counts, percentages], axis=1, keys=['Count', 'Percentage'])\n",
    "    stats_df['Percentage'] = stats_df['Percentage'].round(2)\n",
    "    stats_df.loc['Total'] = stats_df.sum()\n",
    "    stats_df.loc['Total', 'Percentage'] = 100 if stats_df.loc['Total', 'Percentage'] > 100 else stats_df.loc['Total', 'Percentage']\n",
    "    return stats_df\n",
    "\n",
    "stats_df = racial_stats(df)\n",
    "print(stats_df)\n",
    "del stats_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A2\n",
    "## Racial composition of cited patent by firm tier j\n",
    "def race_stats_by_firmtier(df):\n",
    "    df_filtered = df[df['assignee_country_j'] == 'US']\n",
    "    race_counts = df_filtered.groupby(['tier_j', 'racediscrete_80_j']).size().reset_index(name='Count')\n",
    "    total_counts = df_filtered['tier_j'].value_counts().reset_index().rename(columns={'index': 'tier_j', 'tier_j': 'Total'})\n",
    "    race_counts = pd.merge(race_counts, total_counts, on='tier_j')\n",
    "    race_counts['Percentage'] = (race_counts['Count'] / race_counts['Total']) * 100\n",
    "    print(df_filtered['tier_j'].value_counts(normalize=True) * 100)\n",
    "    return race_counts\n",
    "\n",
    "stats_df = race_stats_by_firmtier(df)\n",
    "print(stats_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A2\n",
    "## Racial composition of cited patent by Main CPC Section j\n",
    "def race_stats_by_cpc(df):\n",
    "    df_filtered = df[df['assignee_country_j'] == 'US']\n",
    "    df_filtered = df_filtered.dropna(subset=['main_cpc_section_j'])\n",
    "    print((df_filtered['main_cpc_section_j'].value_counts(normalize=True) * 100).map('{:.2f}%'.format))\n",
    "    \n",
    "    race_counts = df_filtered.groupby(['main_cpc_section_j', 'racediscrete_80_j']).size().reset_index(name='Count')\n",
    "    total_counts = df_filtered['main_cpc_section_j'].value_counts().reset_index().rename(columns={'index': 'main_cpc_section_j', 'main_cpc_section_j': 'Total'})\n",
    "    race_counts = pd.merge(race_counts, total_counts, on='main_cpc_section_j')\n",
    "    race_counts['Percentage'] = race_counts['Count'] / race_counts['Total'] * 100\n",
    "    race_counts['Percentage'] = race_counts['Percentage'].map('{:.2f}%'.format)\n",
    "    \n",
    "    return race_counts\n",
    "\n",
    "stats_df = race_stats_by_cpc(df)\n",
    "print(stats_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of the number of citations of citing patents i\n",
    "summary = df['num_citations_i'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['num_citations_i'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['num_citations_i'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of number of inventors of citing patent i\n",
    "summary = df['num_inventors_i'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['num_inventors_i'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['num_inventors_i'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of number of inventors of cited patent j\n",
    "summary = df['num_inventors_j'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['num_inventors_j'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['num_inventors_j'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of average experience of citing patent i\n",
    "summary = df['avg_experience_i'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['avg_experience_i'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['avg_experience_i'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of average experience of cited patent j\n",
    "summary = df['avg_experience_j'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['avg_experience_j'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['avg_experience_j'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of years lag\n",
    "summary = df['years_lag'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['years_lag'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['years_lag'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of KPSS Index of cited patent j\n",
    "summary = df['dollar_real_log_j'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['dollar_real_log_j'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['dollar_real_log_j'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of Cosine Similarity\n",
    "summary = df['sim_score'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['sim_score'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['sim_score'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A3\n",
    "## Summary statistics of the Omission Index\n",
    "summary = df['omission'].describe().apply(lambda x: format(x, '.2f'))\n",
    "percentile_10 = format(df['omission'].quantile(0.10), '.2f')\n",
    "percentile_90 = format(df['omission'].quantile(0.90), '.2f')\n",
    "summary = summary.append(pd.Series({'10%': percentile_10, '90%': percentile_90}))\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A4\n",
    "## Summary statistics of KPSS Index (by gender composition)\n",
    "df['genderdiscrete_j'] = df['genderdiscrete_09_100_j']\n",
    "df.loc[(df['leadfemale_09_100_j'] == 1) & (df['genderdiscrete_j'] != \"allfemale\"), 'genderdiscrete_j'] = \"leadfemale\"\n",
    "df.loc[(df['lead_gender_09_100_j'] == \"male\") & (df['genderdiscrete_j'] != \"allmale\"), 'genderdiscrete_j'] = \"leadmale\"\n",
    "df.loc[~df['genderdiscrete_j'].isin([\"allfemale\", \"allmale\", \"leadfemale\", \"leadmale\"]), 'genderdiscrete_j'] = \"other\"\n",
    "\n",
    "def custom_summary(s):\n",
    "    summary = s.describe().apply(lambda x: format(x, '.2f'))\n",
    "    percentiles = pd.Series({'10%': format(s.quantile(0.10), '.2f'), '90%': format(s.quantile(0.90), '.2f')})\n",
    "    summary = pd.concat([summary, percentiles])\n",
    "    return summary\n",
    "\n",
    "grouped_summary = df.groupby('genderdiscrete_j')['dollar_real_log_j'].apply(custom_summary).unstack()\n",
    "print(grouped_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A5\n",
    "## Summary Statistics of Cosine Similarity (by gender composition)\n",
    "df['genderdiscrete_j'] = df['genderdiscrete_09_100_j']\n",
    "df.loc[(df['leadfemale_09_100_j'] == 1) & (df['genderdiscrete_j'] != \"allfemale\"), 'genderdiscrete_j'] = \"leadfemale\"\n",
    "df.loc[(df['lead_gender_09_100_j'] == \"male\") & (df['genderdiscrete_j'] != \"allmale\"), 'genderdiscrete_j'] = \"leadmale\"\n",
    "df.loc[~df['genderdiscrete_j'].isin([\"allfemale\", \"allmale\", \"leadfemale\", \"leadmale\"]), 'genderdiscrete_j'] = \"other\"\n",
    "\n",
    "def custom_summary(s):\n",
    "    summary = s.describe().apply(lambda x: format(x, '.2f'))\n",
    "    percentiles = pd.Series({'10%': format(s.quantile(0.10), '.2f'), '90%': format(s.quantile(0.90), '.2f')})\n",
    "    summary = pd.concat([summary, percentiles])\n",
    "    return summary\n",
    "\n",
    "grouped_summary = df.groupby('genderdiscrete_j')['sim_score'].apply(custom_summary).unstack()\n",
    "print(grouped_summary)\n",
    "df.drop(columns=['genderdiscrete_j'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A6\n",
    "## Gender share of most similar patent, second most similar patent, ...\n",
    "df['genderdiscrete_j'] = df['genderdiscrete_09_100_j']\n",
    "df.loc[(df['leadfemale_09_100_j'] == 1) & (df['genderdiscrete_j'] != \"allfemale\"), 'genderdiscrete_j'] = \"leadfemale\"\n",
    "df.loc[(df['lead_gender_09_100_j'] == \"male\") & (df['genderdiscrete_j'] != \"allmale\"), 'genderdiscrete_j'] = \"leadmale\"\n",
    "df.loc[~df['genderdiscrete_j'].isin([\"allfemale\", \"allmale\", \"leadfemale\", \"leadmale\"]), 'genderdiscrete_j'] = \"other\"\n",
    "\n",
    "df_sorted = df[['patent_id_i', 'patent_id_j', 'sim_score', 'genderdiscrete_j']]\n",
    "df_sorted = df_sorted.sort_values(by=['patent_id_i', 'sim_score'], ascending=[True, False])\n",
    "df_sorted['count'] = df_sorted.groupby('patent_id_i').cumcount() + 1\n",
    "grouped = df_sorted.groupby('count')['genderdiscrete_j'].value_counts(normalize=True).mul(100)\n",
    "percentages = grouped.unstack().fillna(0)\n",
    "print(percentages.to_string(float_format=\"{:0.2f}\".format))\n",
    "del df_sorted, grouped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A7\n",
    "## Levels of collaboration rates, own- and across-gender\n",
    "df_inv = pd.read_csv(dir + 'cleandata/g_inventor_gender_race_age.csv', low_memory=False, usecols=['patent_id', 'inventor_id', 'inventor_name', 'gender_09_100', 'race80', 'patent_year', 'assignee_id', 'assignee_country'])\n",
    "df_inv = df_inv[~df_inv[\"patent_id\"].str.contains(\"[a-zA-Z]\")]\n",
    "df_inv[\"patent_id\"] = pd.to_numeric(df_inv[\"patent_id\"], errors=\"raise\")\n",
    "df_inv = df_inv.sort_values(['patent_id'], ascending=[True])\n",
    "df_inv = df_inv[(df_inv['patent_year'] >= 1981) & (df_inv['patent_year'] <= 2015)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              male_collab_rates female_collab_rates\n",
      "                           mean                mean\n",
      "gender_09_100                                      \n",
      "ambiguous              0.849629            0.150371\n",
      "female                 0.820478            0.179522\n",
      "male                   0.917369            0.082631\n"
     ]
    }
   ],
   "source": [
    "# Table A7 (cont.)\n",
    "df_collab = df_inv[['inventor_id', 'patent_id', 'gender_09_100']]\n",
    "df_pairs = pd.merge(df_collab, df_collab, on='patent_id', suffixes=('_1', '_2'))\n",
    "df_pairs = df_pairs[df_pairs['inventor_id_1'] != df_pairs['inventor_id_2']]\n",
    "\n",
    "df_counts = df_pairs.groupby('inventor_id_1').agg({\n",
    "    'gender_09_100_2': [\n",
    "        ('num_male_collab', lambda x: x.eq('male').sum()),\n",
    "        ('num_female_collab', lambda x: x.eq('female').sum()),\n",
    "        ('num_ambig_collab', lambda x: x.eq('ambiguous').sum())\n",
    "    ]\n",
    "}).reset_index()\n",
    "df_counts.columns = ['_'.join(col) for col in df_counts.columns]\n",
    "\n",
    "df_counts['total_collaborators'] = df_counts['gender_09_100_2_num_male_collab'] + df_counts['gender_09_100_2_num_female_collab'] + df_counts['gender_09_100_2_num_ambig_collab']\n",
    "df_counts['total_collaborators_noambig'] = df_counts['gender_09_100_2_num_male_collab'] + df_counts['gender_09_100_2_num_female_collab']\n",
    "\n",
    "df_counts = df_counts.rename(columns={'inventor_id_1_': 'inventor_id'})\n",
    "df_counts = pd.merge(df_counts,\n",
    "                    df_collab[['inventor_id', 'gender_09_100']].drop_duplicates(),\n",
    "                    on='inventor_id', \n",
    "                    how='left'\n",
    "                    )\n",
    "\n",
    "df_counts = df_counts.rename(columns={\n",
    "    'gender_09_100_2_num_male_collab': 'num_male_collab',\n",
    "    'gender_09_100_2_num_female_collab': 'num_female_collab',\n",
    "    'gender_09_100_2_num_ambig_collab': 'num_ambig_collab'\n",
    "})\n",
    "df_counts['male_collab_rates'] = df_counts['num_male_collab'] / df_counts['total_collaborators_noambig']\n",
    "df_counts['female_collab_rates'] = df_counts['num_female_collab'] / df_counts['total_collaborators_noambig']\n",
    "\n",
    "collab_rate_gender = df_counts.groupby('gender_09_100')[['male_collab_rates', 'female_collab_rates']].agg(['mean'])\n",
    "print(collab_rate_gender)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "## By firm all female patent counts / rates\n",
    "\n",
    "df_assignee_inv_gen = df_inv.groupby(['assignee_id','patent_id'])['gender_09_100'].apply(lambda x: 'AllFemale' if set(x)=={'female'} else 'Other').reset_index()\n",
    "assert df_assignee_inv_gen['gender_09_100'].isna().sum() == 0\n",
    "\n",
    "df_assignee_gen = df_assignee_inv_gen.groupby(['assignee_id'])['patent_id'].nunique().reset_index().rename(columns = {'patent_id':'NumPatent'})\n",
    "df_assignee_gen_female = df_assignee_inv_gen[df_assignee_inv_gen['gender_09_100']=='AllFemale'].groupby(['assignee_id'])['patent_id'].nunique().reset_index().rename(columns = {'patent_id':'NumAllFemalePatent'})\n",
    "df_assignee_gen_other = df_assignee_inv_gen[df_assignee_inv_gen['gender_09_100']=='Other'].groupby(['assignee_id'])['patent_id'].nunique().reset_index().rename(columns = {'patent_id':'NumOtherPatent'})\n",
    "\n",
    "df_assignee_gen2 = pd.merge(\n",
    "    left = df_assignee_gen,\n",
    "    right = df_assignee_gen_female,\n",
    "    on = 'assignee_id',\n",
    "    how = 'left'\n",
    "    )\n",
    "assert len(df_assignee_gen2) == len(df_assignee_gen)\n",
    "\n",
    "df_assignee_gen3 = pd.merge(\n",
    "    left = df_assignee_gen2,\n",
    "    right = df_assignee_gen_other,\n",
    "    on = 'assignee_id',\n",
    "    how = 'left'\n",
    "    )\n",
    "assert len(df_assignee_gen3) == len(df_assignee_gen2)\n",
    "\n",
    "df_assignee_gen3 = df_assignee_gen3.fillna(0)\n",
    "for col in ['NumPatent','NumAllFemalePatent','NumOtherPatent']:\n",
    "    df_assignee_gen3[col] = df_assignee_gen3[col].astype(int)\n",
    "assert (df_assignee_gen3['NumPatent'] != df_assignee_gen3['NumAllFemalePatent'] + df_assignee_gen3['NumOtherPatent']).sum() == 0\n",
    "\n",
    "df_assignee_gen3['AllFemalPatentPct'] = df_assignee_gen3['NumAllFemalePatent']/df_assignee_gen3['NumPatent']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate assignees to 2 groups, in terms of %\n",
    "\n",
    "assignees_partA = set(df_assignee_gen3[\n",
    "    # (df_assignee_gen3['AllFemalPatentPct']>0)\n",
    "    (df_assignee_gen3['AllFemalPatentPct']<=0.5)\n",
    "    ]['assignee_id'].unique())\n",
    "assignees_partB = set(df_assignee_gen3[\n",
    "    (df_assignee_gen3['AllFemalPatentPct']>0.5)\n",
    "    # & (df_assignee_gen3['AllFemalPatentPct']<1)\n",
    "    ]['assignee_id'].unique())\n",
    "assert len(assignees_partA.intersection(assignees_partB)) == 0\n",
    "assert len(assignees_partA) + len(assignees_partB) == df_assignee_gen3['assignee_id'].nunique(2\n",
    "\n",
    "print(df_assignee_gen3['assignee_id'].nunique())\n",
    "print(len(assignees_partA))\n",
    "print(len(assignees_partB))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group A\n",
    "\n",
    "df_collab_partA = df_inv[df_inv['assignee_id'].isin(assignees_partA)][['inventor_id', 'patent_id', 'gender_09_100']]\n",
    "df_pairs_partA = pd.merge(df_collab_partA, df_collab_partA, on='patent_id', suffixes=('_1', '_2'))\n",
    "df_pairs_partA = df_pairs_partA[df_pairs_partA['inventor_id_1'] != df_pairs_partA['inventor_id_2']]\n",
    "\n",
    "df_counts_partA = df_pairs_partA.groupby('inventor_id_1').agg({\n",
    "    'gender_09_100_2': [\n",
    "        ('num_male_collab', lambda x: x.eq('male').sum()),\n",
    "        ('num_female_collab', lambda x: x.eq('female').sum()),\n",
    "        ('num_ambig_collab', lambda x: x.eq('ambiguous').sum())\n",
    "    ]\n",
    "}).reset_index()\n",
    "df_counts_partA.columns = ['_'.join(col) for col in df_counts_partA.columns]\n",
    "\n",
    "df_counts_partA['total_collaborators'] = df_counts_partA['gender_09_100_2_num_male_collab'] + df_counts_partA['gender_09_100_2_num_female_collab'] + df_counts_partA['gender_09_100_2_num_ambig_collab']\n",
    "df_counts_partA['total_collaborators_noambig'] = df_counts_partA['gender_09_100_2_num_male_collab'] + df_counts_partA['gender_09_100_2_num_female_collab']\n",
    "\n",
    "df_counts_partA = df_counts_partA.rename(columns={'inventor_id_1_': 'inventor_id'})\n",
    "df_counts_partA = pd.merge(df_counts_partA,\n",
    "                    df_collab_partA[['inventor_id', 'gender_09_100']].drop_duplicates(),\n",
    "                    on='inventor_id', \n",
    "                    how='left'\n",
    "                    )\n",
    "\n",
    "df_counts_partA = df_counts_partA.rename(columns={\n",
    "    'gender_09_100_2_num_male_collab': 'num_male_collab',\n",
    "    'gender_09_100_2_num_female_collab': 'num_female_collab',\n",
    "    'gender_09_100_2_num_ambig_collab': 'num_ambig_collab'\n",
    "})\n",
    "df_counts_partA['male_collab_rates'] = df_counts_partA['num_male_collab'] / df_counts_partA['total_collaborators_noambig']\n",
    "df_counts_partA['female_collab_rates'] = df_counts_partA['num_female_collab'] / df_counts_partA['total_collaborators_noambig']\n",
    "\n",
    "collab_rate_gender_partA = df_counts_partA.groupby('gender_09_100')[['male_collab_rates', 'female_collab_rates']].agg(['mean'])\n",
    "print(collab_rate_gender_partA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group B\n",
    "df_collab_partB = df_inv[df_inv['assignee_id'].isin(assignees_partB)][['inventor_id', 'patent_id', 'gender_09_100']]\n",
    "df_pairs_partB = pd.merge(df_collab_partB, df_collab_partB, on='patent_id', suffixes=('_1', '_2'))\n",
    "df_pairs_partB = df_pairs_partB[df_pairs_partB['inventor_id_1'] != df_pairs_partB['inventor_id_2']]\n",
    "\n",
    "df_counts_partB = df_pairs_partB.groupby('inventor_id_1').agg({\n",
    "    'gender_09_100_2': [\n",
    "        ('num_male_collab', lambda x: x.eq('male').sum()),\n",
    "        ('num_female_collab', lambda x: x.eq('female').sum()),\n",
    "        ('num_ambig_collab', lambda x: x.eq('ambiguous').sum())\n",
    "    ]\n",
    "}).reset_index()\n",
    "df_counts_partB.columns = ['_'.join(col) for col in df_counts_partB.columns]\n",
    "\n",
    "df_counts_partB['total_collaborators'] = df_counts_partB['gender_09_100_2_num_male_collab'] + df_counts_partB['gender_09_100_2_num_female_collab'] + df_counts_partB['gender_09_100_2_num_ambig_collab']\n",
    "df_counts_partB['total_collaborators_noambig'] = df_counts_partB['gender_09_100_2_num_male_collab'] + df_counts_partB['gender_09_100_2_num_female_collab']\n",
    "\n",
    "df_counts_partB = df_counts_partB.rename(columns={'inventor_id_1_': 'inventor_id'})\n",
    "df_counts_partB = pd.merge(df_counts_partB,\n",
    "                    df_collab_partB[['inventor_id', 'gender_09_100']].drop_duplicates(),\n",
    "                    on='inventor_id', \n",
    "                    how='left'\n",
    "                    )\n",
    "\n",
    "df_counts_partB = df_counts_partB.rename(columns={\n",
    "    'gender_09_100_2_num_male_collab': 'num_male_collab',\n",
    "    'gender_09_100_2_num_female_collab': 'num_female_collab',\n",
    "    'gender_09_100_2_num_ambig_collab': 'num_ambig_collab'\n",
    "})\n",
    "df_counts_partB['male_collab_rates'] = df_counts_partB['num_male_collab'] / df_counts_partB['total_collaborators_noambig']\n",
    "df_counts_partB['female_collab_rates'] = df_counts_partB['num_female_collab'] / df_counts_partB['total_collaborators_noambig']\n",
    "\n",
    "collab_rate_gender_partB = df_counts_partB.groupby('gender_09_100')[['male_collab_rates', 'female_collab_rates']].agg(['mean'])\n",
    "print(collab_rate_gender_partB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate assignees to 2 groups, in terms of #\n",
    "\n",
    "assignees_partC = set(df_assignee_gen3[\n",
    "    (df_assignee_gen3['NumAllFemalePatent']<=20)\n",
    "    ]['assignee_id'].unique())\n",
    "assignees_partD = set(df_assignee_gen3[\n",
    "    (df_assignee_gen3['NumAllFemalePatent']>20)\n",
    "    ]['assignee_id'].unique())\n",
    "assert len(assignees_partC.intersection(assignees_partD)) == 0\n",
    "assert len(assignees_partC) + len(assignees_partD) == df_assignee_gen3['assignee_id'].nunique()\n",
    "\n",
    "print(df_assignee_gen3['assignee_id'].nunique())\n",
    "print(len(assignees_partC))\n",
    "print(len(assignees_partD))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group C\n",
    "\n",
    "df_collab_partC = df_inv[df_inv['assignee_id'].isin(assignees_partC)][['inventor_id', 'patent_id', 'gender_09_100']]\n",
    "df_pairs_partC = pd.merge(df_collab_partC, df_collab_partC, on='patent_id', suffixes=('_1', '_2'))\n",
    "df_pairs_partC = df_pairs_partC[df_pairs_partC['inventor_id_1'] != df_pairs_partC['inventor_id_2']]\n",
    "\n",
    "df_counts_partC = df_pairs_partC.groupby('inventor_id_1').agg({\n",
    "    'gender_09_100_2': [\n",
    "        ('num_male_collab', lambda x: x.eq('male').sum()),\n",
    "        ('num_female_collab', lambda x: x.eq('female').sum()),\n",
    "        ('num_ambig_collab', lambda x: x.eq('ambiguous').sum())\n",
    "    ]\n",
    "}).reset_index()\n",
    "df_counts_partC.columns = ['_'.join(col) for col in df_counts_partC.columns]\n",
    "\n",
    "df_counts_partC['total_collaborators'] = df_counts_partC['gender_09_100_2_num_male_collab'] + df_counts_partC['gender_09_100_2_num_female_collab'] + df_counts_partC['gender_09_100_2_num_ambig_collab']\n",
    "df_counts_partC['total_collaborators_noambig'] = df_counts_partC['gender_09_100_2_num_male_collab'] + df_counts_partC['gender_09_100_2_num_female_collab']\n",
    "\n",
    "df_counts_partC = df_counts_partC.rename(columns={'inventor_id_1_': 'inventor_id'})\n",
    "df_counts_partC = pd.merge(df_counts_partC,\n",
    "                    df_collab_partC[['inventor_id', 'gender_09_100']].drop_duplicates(),\n",
    "                    on='inventor_id', \n",
    "                    how='left'\n",
    "                    )\n",
    "\n",
    "df_counts_partC = df_counts_partC.rename(columns={\n",
    "    'gender_09_100_2_num_male_collab': 'num_male_collab',\n",
    "    'gender_09_100_2_num_female_collab': 'num_female_collab',\n",
    "    'gender_09_100_2_num_ambig_collab': 'num_ambig_collab'\n",
    "})\n",
    "df_counts_partC['male_collab_rates'] = df_counts_partC['num_male_collab'] / df_counts_partC['total_collaborators_noambig']\n",
    "df_counts_partC['female_collab_rates'] = df_counts_partC['num_female_collab'] / df_counts_partC['total_collaborators_noambig']\n",
    "\n",
    "collab_rate_gender_partC = df_counts_partC.groupby('gender_09_100')[['male_collab_rates', 'female_collab_rates']].agg(['mean'])\n",
    "print(collab_rate_gender_partC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group D\n",
    "\n",
    "df_collab_partD = df_inv[df_inv['assignee_id'].isin(assignees_partD)][['inventor_id', 'patent_id', 'gender_09_100']]\n",
    "df_pairs_partD = pd.merge(df_collab_partD, df_collab_partD, on='patent_id', suffixes=('_1', '_2'))\n",
    "df_pairs_partD = df_pairs_partD[df_pairs_partD['inventor_id_1'] != df_pairs_partD['inventor_id_2']]\n",
    "\n",
    "df_counts_partD = df_pairs_partD.groupby('inventor_id_1').agg({\n",
    "    'gender_09_100_2': [\n",
    "        ('num_male_collab', lambda x: x.eq('male').sum()),\n",
    "        ('num_female_collab', lambda x: x.eq('female').sum()),\n",
    "        ('num_ambig_collab', lambda x: x.eq('ambiguous').sum())\n",
    "    ]\n",
    "}).reset_index()\n",
    "df_counts_partD.columns = ['_'.join(col) for col in df_counts_partD.columns]\n",
    "\n",
    "df_counts_partD['total_collaborators'] = df_counts_partD['gender_09_100_2_num_male_collab'] + df_counts_partD['gender_09_100_2_num_female_collab'] + df_counts_partD['gender_09_100_2_num_ambig_collab']\n",
    "df_counts_partD['total_collaborators_noambig'] = df_counts_partD['gender_09_100_2_num_male_collab'] + df_counts_partD['gender_09_100_2_num_female_collab']\n",
    "\n",
    "df_counts_partD = df_counts_partD.rename(columns={'inventor_id_1_': 'inventor_id'})\n",
    "df_counts_partD = pd.merge(df_counts_partD,\n",
    "                    df_collab_partD[['inventor_id', 'gender_09_100']].drop_duplicates(),\n",
    "                    on='inventor_id', \n",
    "                    how='left'\n",
    "                    )\n",
    "\n",
    "df_counts_partD = df_counts_partD.rename(columns={\n",
    "    'gender_09_100_2_num_male_collab': 'num_male_collab',\n",
    "    'gender_09_100_2_num_female_collab': 'num_female_collab',\n",
    "    'gender_09_100_2_num_ambig_collab': 'num_ambig_collab'\n",
    "})\n",
    "df_counts_partD['male_collab_rates'] = df_counts_partD['num_male_collab'] / df_counts_partD['total_collaborators_noambig']\n",
    "df_counts_partD['female_collab_rates'] = df_counts_partD['num_female_collab'] / df_counts_partD['total_collaborators_noambig']\n",
    "\n",
    "collab_rate_gender_partD = df_counts_partD.groupby('gender_09_100')[['male_collab_rates', 'female_collab_rates']].agg(['mean'])\n",
    "print(collab_rate_gender_partD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table A8\n",
    "## Levels of collaboration rates, own- and across-race\n",
    "df_inv_us = df_inv[df_inv['assignee_country']=='US']\n",
    "df_collab = df_inv_us[['inventor_id', 'patent_id', 'race80']]\n",
    "df_pairs = pd.merge(df_collab, df_collab, on='patent_id', suffixes=('_1', '_2'))\n",
    "df_pairs = df_pairs[df_pairs['inventor_id_1'] != df_pairs['inventor_id_2']]\n",
    "\n",
    "df_counts = df_pairs.groupby('inventor_id_1').agg({\n",
    "    'race80_2': [\n",
    "        ('num_asian_collab', lambda x: x.eq('asian').sum()),\n",
    "        ('num_black_collab', lambda x: x.eq('nh_black').sum()),\n",
    "        ('num_hispanic_collab', lambda x: x.eq('hispanic').sum()),\n",
    "        ('num_white_collab', lambda x: x.eq('nh_white').sum()),\n",
    "        ('num_ambig_collab', lambda x: x.eq('ambiguous').sum())\n",
    "    ]\n",
    "}).reset_index()\n",
    "df_counts.columns = ['_'.join(col) for col in df_counts.columns]\n",
    "\n",
    "df_counts['total_collaborators'] = df_counts['race80_2_num_asian_collab'] + df_counts['race80_2_num_black_collab'] + df_counts['race80_2_num_hispanic_collab'] + df_counts['race80_2_num_white_collab'] + df_counts['race80_2_num_ambig_collab']\n",
    "df_counts['total_collaborators_noambig'] = df_counts['race80_2_num_asian_collab'] + df_counts['race80_2_num_black_collab'] + df_counts['race80_2_num_hispanic_collab'] + df_counts['race80_2_num_white_collab']\n",
    "\n",
    "df_counts = df_counts.rename(columns={'inventor_id_1_': 'inventor_id'})\n",
    "df_counts = pd.merge(df_counts,\n",
    "                    df_collab[['inventor_id', 'race80']].drop_duplicates(),\n",
    "                    on='inventor_id', \n",
    "                    how='left'\n",
    "                    )\n",
    "\n",
    "df_counts = df_counts.rename(columns={\n",
    "    'race80_2_num_asian_collab': 'num_asian_collab',\n",
    "    'race80_2_num_black_collab': 'num_black_collab',\n",
    "    'race80_2_num_hispanic_collab': 'num_hispanic_collab',\n",
    "    'race80_2_num_white_collab': 'num_white_collab',\n",
    "    'race80_2_num_ambig_collab': 'num_ambig_collab'\n",
    "})\n",
    "df_counts['asian_collab_rates'] = df_counts['num_asian_collab'] / df_counts['total_collaborators_noambig']\n",
    "df_counts['black_collab_rates'] = df_counts['num_black_collab'] / df_counts['total_collaborators_noambig']\n",
    "df_counts['hispanic_collab_rates'] = df_counts['num_hispanic_collab'] / df_counts['total_collaborators_noambig']\n",
    "df_counts['white_collab_rates'] = df_counts['num_white_collab'] / df_counts['total_collaborators_noambig']\n",
    "\n",
    "collab_rate_race = df_counts.groupby('race80')[['asian_collab_rates', 'black_collab_rates', 'hispanic_collab_rates', 'white_collab_rates']].agg(['mean'])\n",
    "print(collab_rate_race)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Figure 1\n",
    "## Inventor Network Effect (Distance between lead inventors)\n",
    "## Including Mixed-gendered patents\n",
    "\n",
    "# Estimates are copied from run_reg.do\n",
    "data = pd.DataFrame({\n",
    "    'male_coef': [.0008606, -.0044389, -.0152458, -.0423923, -.086176, -.1414269],\n",
    "    'male_se': [.0004686, .0005546, .0007119, .0008431, .0008676, .0008541],\n",
    "    'inter_coef': [.005051, .007294, .0050753, .0211703, .0365683, .0572035],\n",
    "    'inter_se': [.003251, .0035575, .0051951, .0058268, .0066169, .0075895],\n",
    "    'cov': [-1.773e-07, -2.073e-07, -3.775e-07, -5.580e-07, -5.617e-07, -6.039e-07],\n",
    "    'distance': [5, 4, 3, 2, 1, 0]\n",
    "})\n",
    "data['female_coef'] = data['male_coef'] + data['inter_coef']\n",
    "data['female_se'] = np.sqrt(data['male_se']**2 + data['inter_se']**2 + 2*data['cov'])\n",
    "data['male_lb'] = data['male_coef'] - 1.96 * data['male_se']\n",
    "data['male_ub'] = data['male_coef'] + 1.96 * data['male_se']\n",
    "data['female_lb'] = data['female_coef'] - 1.96 * data['female_se']\n",
    "data['female_ub'] = data['female_coef'] + 1.96 * data['female_se']\n",
    "\n",
    "# Plotting\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "ax.spines['right'].set_visible(False)\n",
    "ax.spines['top'].set_visible(False)\n",
    "ax.errorbar(data['distance'], data['male_coef'], yerr=1.96*data['male_se'], fmt='k-', lw=0.8, capsize=3, label='Not All Female')\n",
    "ax.errorbar(data['distance'], data['female_coef'], yerr=1.96*data['female_se'], fmt='k--', lw=0.8, capsize=3, dashes=[5, 2], label='All Female')\n",
    "ax.axhline(0, color='gray', linewidth=0.5, linestyle='--')\n",
    "ax.set_xlabel('Patent Distance', fontsize=12)\n",
    "ax.set_ylabel('Coefficient Estimate', fontsize=12)\n",
    "ax.legend(loc='lower right', fontsize=10)\n",
    "plt.savefig(fig_dir + 'figure1.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Figure B1\n",
    "## Inventor Network Effect (node_dist_first)\n",
    "## No Mixed-gendered patents\n",
    "\n",
    "# Estimates are copied from run_reg.do\n",
    "data = pd.DataFrame({\n",
    "    'male_coef': [.000708, -.00593, -.0165123, -.044014, -.0899064, -.138745],\n",
    "    'male_se': [.0006371, .000748, .0009554, .0011467, .0011473, .0010476],\n",
    "    'inter_coef': [.0073146, .0110938, .0073715, .0258917, .0532918, .0773322],\n",
    "    'inter_se': [.0038224, .0042254, .0067003, .0074534, .0093498, .0110969],\n",
    "    'cov': [-3.265e-07, -4.022e-07, -7.153e-07, 1.315e-06, 1.316e-06, 1.097e-06],\n",
    "    'distance': [5, 4, 3, 2, 1, 0]\n",
    "})\n",
    "\n",
    "data['female_coef'] = data['male_coef'] + data['inter_coef']\n",
    "data['female_se'] = np.sqrt(data['male_se']**2 + data['inter_se']**2 + 2*data['cov'])\n",
    "data['male_lb'] = data['male_coef'] - 1.96 * data['male_se']\n",
    "data['male_ub'] = data['male_coef'] + 1.96 * data['male_se']\n",
    "data['female_lb'] = data['female_coef'] - 1.96 * data['female_se']\n",
    "data['female_ub'] = data['female_coef'] + 1.96 * data['female_se']\n",
    "\n",
    "# Plotting\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "ax.spines['right'].set_visible(False)\n",
    "ax.spines['top'].set_visible(False)\n",
    "ax.errorbar(data['distance'], data['male_coef'], yerr=1.96*data['male_se'], fmt='k-', lw=0.8, capsize=3, label='All Male')\n",
    "ax.errorbar(data['distance'], data['female_coef'], yerr=1.96*data['female_se'], fmt='k--', lw=0.8, capsize=3, dashes=[5, 2], label='All Female')\n",
    "ax.axhline(0, color='gray', linewidth=0.5, linestyle='--')\n",
    "ax.set_xlabel('Patent Distance', fontsize=12)\n",
    "ax.set_ylabel('Coefficient Estimate', fontsize=12)\n",
    "ax.legend(loc='lower right', fontsize=10)\n",
    "plt.savefig(fig_dir + 'figureb1.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Figure B2\n",
    "## Inventor Network Effect (node_dist_all)\n",
    "## Including Mixed-gendered patents\n",
    "\n",
    "# Estimates are from run_reg_cosine.do\n",
    "data = pd.DataFrame({\n",
    "    'male_coef': [-.0019382, -.0068907, -.0132274, -.0289234, -.0542335, -.1422722],\n",
    "    'male_se': [.0004398, .0004989, .000604, .0007798, .0008468, .0007157],\n",
    "    'inter_coef': [.007205, .0056541, .0039707, .006897, .0034625, .0505136],\n",
    "    'inter_se': [.0029746, .0032148, .0045506, .0055173, .0059968, .0061717],\n",
    "    'cov': [-1.556e-07, -1.695e-07, -2.422e-07, 6.080e-07, 7.171e-07, 5.122e-07],\n",
    "    'distance': [5, 4, 3, 2, 1, 0]\n",
    "})\n",
    "data['female_coef'] = data['male_coef'] + data['inter_coef']\n",
    "data['female_se'] = np.sqrt(data['male_se']**2 + data['inter_se']**2 + 2*data['cov'])\n",
    "data['male_lb'] = data['male_coef'] - 1.96 * data['male_se']\n",
    "data['male_ub'] = data['male_coef'] + 1.96 * data['male_se']\n",
    "data['female_lb'] = data['female_coef'] - 1.96 * data['female_se']\n",
    "data['female_ub'] = data['female_coef'] + 1.96 * data['female_se']\n",
    "\n",
    "# Plotting\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "ax.spines['right'].set_visible(False)\n",
    "ax.spines['top'].set_visible(False)\n",
    "ax.errorbar(data['distance'], data['male_coef'], yerr=1.96*data['male_se'], fmt='k-', lw=0.8, capsize=3, label='Not All Female')\n",
    "ax.errorbar(data['distance'], data['female_coef'], yerr=1.96*data['female_se'], fmt='k--', lw=0.8, capsize=3, dashes=[5, 2], label='All Female')\n",
    "ax.axhline(0, color='gray', linewidth=0.5, linestyle='--')\n",
    "ax.set_xlabel('Patent Distance', fontsize=12)\n",
    "ax.set_ylabel('Coefficient Estimate', fontsize=12)\n",
    "ax.legend(fontsize=10)\n",
    "plt.savefig(fig_dir + 'figureb2.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
